OrphaNet Rare Disease association and analysis

Cross-scale network signatures of rare diseases

The structure of Orphanet Rare Disease Ontology was queried and processed using R interface of the Ontology Lookup Service (https://lgatto.github.io/rols/index.html). A number of calculation per-computed for further analyses on this section was performed in source/Orphanet_annotate_genes_to_ancestors.R.

# load direct gene association
orpha_gene_onset_df <- readRDS("../cache/orpha_gene_onset_df.RDS")

# disease gene association with roots
orphanet_gene_association <- read_tsv("../data/orphaNet_disease_gene_association_with_roots.tsv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   orphaID = col_character(),
##   label = col_character(),
##   genes = col_character(),
##   n_genes = col_double(),
##   description = col_character(),
##   roots = col_character()
## )
# disease gene association at group level
source("../functions/readdata_functions.R")
gene_disease_orpha = process_disease_genes_data("../data/table_disease_gene_assoc_orphanet_genetic.tsv", 1, 2000)
## reading disease gene association data
## [1] "read 28 diseases, of total 3593 associated genes."
#source("../source/read_orphanet_gene_association_data.R")
gene_disease_orpha = gene_disease_orpha$disgene_df

# Modify and merge data
orpha_gene_display_df <- orphanet_gene_association %>%
  dplyr::filter(n_genes > 0) %>%
  mutate(ID = as.double(str_remove(orphaID, "Orphanet:"))) 

DT::datatable(orpha_gene_display_df[,c("ID", "label", "n_genes", "genes")] ,
                extensions = 'Buttons',
                options = list(dom = 'Blfrtip',
                               buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
                               lengthMenu = list(c(10,25,50,-1),
                                                 c(10,25,50,"All"))))

Rare diseases are scarcely annotated, and most disease terms (2686 out of 3771) are only associated with one gene. Network-based measurements for individual diseases are unfeasible and grouping of the terms for higher level association are necessary.

# from orphanet_mapping_top_branch
gene_per_disease = orpha_gene_onset_df %>% 
  dplyr::filter(!is.na(gene)) %>% 
  count(orphaID)

gene_per_disease_count = gene_per_disease %>% 
  mutate(group = cut(n, breaks = c(0:10, 100), labels = c(1:10, "> 10"))) %>% 
  count(group)

p = ggplot(gene_per_disease_count, aes(x = group, y =n)) + geom_col() +
  ggtitle("Most orphanet diseases are immediately associated with one gene") +
  theme_minimal() + ylab("Number of diseases") + xlab("Number of genes per disease")#+ scale_y_log10()

plotly::ggplotly(p)
pacman::p_load(cowplot)
#gene_per_disease_group = gene_disease_orpha %>% 
 # mutate(group = cut(n_genes, breaks = c(seq(0,100,20), seq(200,1000, 200), Inf))) %>% count(group)

#ggplot(gene_per_disease_group, aes(x = group, y =n)) + geom_col() +
#  ggtitle("Most orphanet diseases are immediately associated with one gene") +
#  theme_minimal() + ylab("Number of diseases") + xlab("Number of genes per disease")#+ scale_y_log10()

# number shift
gene_per_disease_group = gene_disease_orpha %>% 
  mutate(disease = "Grouped", n = N) %>% 
  select(disease, n)

gene_per_disease = gene_per_disease %>% 
  mutate(disease = "Individual")

gene_per_disease_both = gene_per_disease %>% 
  select(disease, n) %>%
  bind_rows(., gene_per_disease_group) %>% 
  dplyr::filter(n>0) %>%
  # relevel factor
  mutate(disease = factor(disease, levels = c("Individual","Grouped")))

# add break values for plotting on log scale on x axis
breakvals = c(0:9, seq(10,90,10), seq(100,1000,100), 2000)

gene_per_disease_both_count = gene_per_disease_both %>% 
  mutate(group = cut(n, breaks = breakvals, include.lowest = T, labels = breakvals[-1])) %>%
  group_by(disease, group) %>% 
  summarise(n = n()) %>% 
  mutate(prop = n/sum(n)) %>% 
  full_join(., tibble(group = as_factor(breakvals[-1])))
## `summarise()` has grouped output by 'disease'. You can override using the `.groups` argument.
## Joining, by = "group"
gene_per_disease_both_count$disease[is.na(gene_per_disease_both_count$disease)] = "Grouped"
gene_per_disease_both_count$n[is.na(gene_per_disease_both_count$n)] = 0
# plot
#ggplot(gene_per_disease_both_count, aes(y = prop, x = group)) + geom_col() +  theme_minimal() +facet_grid(disease ~ .)

p = ggplot(gene_per_disease_both_count, aes(y = n, x = group)) + 
  geom_col() +  
  facet_grid(disease ~ ., scales = "free") +
  #scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
  scale_x_discrete(breaks = c(1,10,100,1000))+
  geom_vline(xintercept = which(levels(gene_per_disease_both_count$group)==20), linetype = "dashed", col = "red") + theme_cowplot() + xlab("Genes per disease term") + ylab("Number of terms")

p

#ggsave("../Figs/orphanet_individual_vs_grouped_diseases_bar.pdf", p, width = 3*1.5, height = 2*1.5)

Based on the plot above, accumulating gene-disease association for descendant disease terms of ‘Rare genetic disease’ ( “Orphanet:98053”) resulting in physiologically distinct disease groups where the majority (26 out of 28) groups are associated with sufficient amount of genes for module detection (\(n=20\)).

The disease gene association can be found in data/table_disease_gene_assoc_orphanet_genetic.tsv. The summary of disease groups and all associated genes are shown below.

rmarkdown::paged_table(gene_disease_orpha %>% select(name, N) %>% arrange(-N)) 

Omitting disease groups with fewer than 20 associated genes, the number of terms (out of 3771) and genes excluded from further analyses include:

# roots that are removed
removed_roots <- gene_disease_orpha %>% filter(N<20) %>% pull(name)

# table for removed terms
removed_disease_terms <- orphanet_gene_association %>%
  mutate(n_roots = str_count(roots, ";")+1,
         removed_roots = grepl(removed_roots[1], roots) + grepl(removed_roots[2], roots),
         remained_roots = n_roots - removed_roots) %>%
  filter(remained_roots == 0) %>%
  select(orphaID, label, genes, roots)

removed_genes <- sapply(removed_disease_terms$genes, function(x) str_split(x, ";")) %>% unlist %>% unique()

knitr::kable(removed_disease_terms)
orphaID label genes roots
Orphanet:199241 Pulmonary capillary hemangiomatosis EIF2AK4 Rare genetic respiratory disease
Orphanet:210122 Congenital alveolar capillary dysplasia FOXF1 Rare genetic respiratory disease
Orphanet:217566 Chronic respiratory distress with surfactant metabolism deficiency SFTPC Rare genetic respiratory disease
Orphanet:217563 Neonatal acute respiratory distress due to SP-B deficiency SFTPB Rare genetic respiratory disease
Orphanet:440402 Interstitial lung disease due to ABCA3 deficiency ABCA3 Rare genetic respiratory disease
Orphanet:440392 Interstitial lung disease due to SP-C deficiency SFTPC Rare genetic respiratory disease
Orphanet:444092 Autoimmune interstitial lung disease-arthritis syndrome COPA Rare genetic respiratory disease
Orphanet:31837 Pulmonary venoocclusive disease BMPR2;EIF2AK4 Rare genetic respiratory disease
Orphanet:100051 Hereditary angioedema type 2 SERPING1 Serpinopathy
Orphanet:100050 Hereditary angioedema type 1 SERPING1 Serpinopathy

There are 10 disease terms whose associated genes are not associated with any other disease groups, and hence these 8 genes are omitted.

Tree map for ontology representation of rare diseases

# download the association
library(RColorBrewer)

orphanet_gene_association_unique_root <- orphanet_gene_association %>%
  separate_rows(., roots, sep = ";", convert = T) %>% dplyr::filter(!is.na(roots)) %>% 
  mutate(roots = as.factor(roots))

## take a function to allow modifying alpha values
# Add an alpha value to a colour
add.alpha <- function(col=NULL, alpha=1){
  if(missing(col))
    stop("Please provide a vector of colours.")
  apply(sapply(col, col2rgb)/255, 2, 
                     function(x) 
                       rgb(x[1], x[2], x[3], alpha=alpha))  
}

# define colours for all disease groups
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nrow(gene_disease_orpha))

# add id 1-28
gene_disease_orpha$id = 1:nrow(gene_disease_orpha)

# add colour with corresponding alpha values to each disease group
gene_disease_orpha_mod <- gene_disease_orpha %>% 
  rowwise() %>%
  mutate(col =mycolors[id])
  #mutate(col = add.alpha(mycolors[id], N/max(gene_disease_orpha$N)))

gene_disease_orpha_mod$root = "Orphanet"

# load voronoi treemap package
if(!"voronoiTreemap" %in% rownames(installed.packages())){
  pacman::p_load_gh("https://github.com/uRosConf/voronoiTreemap")
} else{
  library(voronoiTreemap)
}


gene_disease_orpha_mod$plotlab = str_remove_all( gene_disease_orpha_mod$name, "Rare |genetic | disease| syndrome| disorder")
onto_json <- vt_export_json(vt_input_from_df(gene_disease_orpha_mod, hierachyVar0 = "root", hierachyVar1 = "name", hierachyVar2 = "name", colorVar = "col", weightVar = "N", labelVar = "plotlab"))

vt_d3(onto_json)